#!/usr/bin/env bash
#
# Starts `vllm serve` for the checkpoint at $model_id with 4 pipeline-parallel
# stages and 4-way tensor parallelism. Behaviour is tuned through the VLLM_*
# environment variables exported below plus the flags in serve_args.
#
# Usage: run directly; the script takes no arguments.

# Trace each command so the fully expanded launch line appears in the output.
set -x

# 36000 s = 10 h.
export VLLM_ENGINE_ITERATION_TIMEOUT_S=36000
# NOTE(review): presumably milliseconds (36000000 ms = 10 h, matching the
# value above) -- confirm against the vLLM environment-variable docs.
export VLLM_RPC_TIMEOUT=36000000
# NOTE(review): the meaning of the next two switches is not visible from this
# script; judging by the names they select a W4A8 MoE path and force CUDA graph
# usage -- confirm against the vLLM build in use.
export VLLM_W8A8_MOE_USE_W4A8=1
export VLLM_ENFORCE_CUDA_GRAPH=1

# Local path of the model checkpoint handed to `vllm serve`.
model_id=/home/DeepSeek-R1-int4-pack8/

# Four comma-separated counts (16 + 16 + 16 + 13 = 61), the same number of
# entries as --pipeline-parallel-size below; presumably layers per pipeline
# stage -- keep the two in sync when changing either.
export VLLM_PP_LAYER_PARTITION="16,16,16,13"

# Flags are collected in an array so that individual entries can be commented
# out without leaving a dangling line-continuation backslash behind.
serve_args=(
  --pipeline-parallel-size 4
  --tensor-parallel-size 4
  --trust-remote-code
  # 1024 * 4 = 4096; written with POSIX $(( )) rather than the deprecated $[ ].
  --max-model-len "$((1024 * 4))"
  --gpu-memory-utilization 0.9
  --compilation_config='{"level":0,"cudagraph_mode":"FULL_DECODE_ONLY"}'
  --disable_log_requests
  # --disable_log_stats   # intentionally disabled; uncomment to pass the flag
)

# The script's exit status is that of `vllm serve` (last command executed).
vllm serve "$model_id" "${serve_args[@]}"